# Add any packages you want in this chunk:
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
We are going to look at the property prices for Orange County. From there we are going to try and predict the sales price by linear regression.
Bring in the data and make sure the data types are correct. If not, make the proper changes. The file is located within this project. data/prop_prices_reduced.csv
prop_prices <- read.csv("data\\prop_prices_reduced.csv")
head(prop_prices)
## sale_def bed bath area_heated area dist_cbd dist_lakes pool
## 1 88142 4 2 1270 3631 5211.83 638.57 0
## 2 78046 2 2 1037 1307 14706.67 198.90 0
## 3 273777 4 2 2821 10309 20897.89 3799.20 0
## 4 229185 4 3 2341 9578 7057.41 193.39 0
## 5 464029 4 3 2981 12287 7771.70 225.97 1
## 6 109152 3 2 1307 10088 18662.69 496.94 1
summary(prop_prices)
## sale_def bed bath area_heated
## Min. : 15296 Min. :1.000 Min. :1.000 Min. : 650
## 1st Qu.: 115565 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 1387
## Median : 151085 Median :3.000 Median :2.000 Median : 1744
## Mean : 200319 Mean :3.398 Mean :2.248 Mean : 1955
## 3rd Qu.: 213902 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2301
## Max. :7629992 Max. :7.000 Max. :8.000 Max. :10796
## area dist_cbd dist_lakes pool
## Min. : 1307 Min. : 598.2 Min. : 16.74 Min. :0.000
## 1st Qu.: 6498 1st Qu.: 9701.7 1st Qu.: 314.86 1st Qu.:0.000
## Median : 8733 Median :14552.0 Median : 638.82 Median :0.000
## Mean : 10565 Mean :14382.9 Mean :1403.54 Mean :0.229
## 3rd Qu.: 11250 3rd Qu.:19369.2 3rd Qu.:1735.35 3rd Qu.:0.000
## Max. :269496 Max. :36107.6 Max. :8206.79 Max. :1.000
prop_prices$lake_front <- prop_prices$dist_lakes
prop_prices$lake_front[prop_prices$lake_front < 100] = 1
prop_prices$lake_front[prop_prices$lake_front > 100] = 0
prop_prices$downtown <- prop_prices$dist_cbd
prop_prices$downtown[prop_prices$downtown < 5280] = 1
prop_prices$downtown[prop_prices$downtown > 5280] = 0
prop_prices$luxury = 0
prop_prices$luxury[prop_prices$bed > 4 & prop_prices$area_heated > 2500 & prop_prices$pool == 1] = 1
# prop_prices$bed <- factor(prop_prices$bed)
head(prop_prices)
## sale_def bed bath area_heated area dist_cbd dist_lakes pool lake_front
## 1 88142 4 2 1270 3631 5211.83 638.57 0 0
## 2 78046 2 2 1037 1307 14706.67 198.90 0 0
## 3 273777 4 2 2821 10309 20897.89 3799.20 0 0
## 4 229185 4 3 2341 9578 7057.41 193.39 0 0
## 5 464029 4 3 2981 12287 7771.70 225.97 1 0
## 6 109152 3 2 1307 10088 18662.69 496.94 1 0
## downtown luxury
## 1 1 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 0
summary(prop_prices)
## sale_def bed bath area_heated
## Min. : 15296 Min. :1.000 Min. :1.000 Min. : 650
## 1st Qu.: 115565 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 1387
## Median : 151085 Median :3.000 Median :2.000 Median : 1744
## Mean : 200319 Mean :3.398 Mean :2.248 Mean : 1955
## 3rd Qu.: 213902 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2301
## Max. :7629992 Max. :7.000 Max. :8.000 Max. :10796
## area dist_cbd dist_lakes pool
## Min. : 1307 Min. : 598.2 Min. : 16.74 Min. :0.000
## 1st Qu.: 6498 1st Qu.: 9701.7 1st Qu.: 314.86 1st Qu.:0.000
## Median : 8733 Median :14552.0 Median : 638.82 Median :0.000
## Mean : 10565 Mean :14382.9 Mean :1403.54 Mean :0.229
## 3rd Qu.: 11250 3rd Qu.:19369.2 3rd Qu.:1735.35 3rd Qu.:0.000
## Max. :269496 Max. :36107.6 Max. :8206.79 Max. :1.000
## lake_front downtown luxury
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.055 Mean :0.089 Mean :0.035
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
Plot histograms for all variables. Additionally, add scatterplots for the relationships between all quantitative variables.
hist(prop_prices$sale_def)
hist(prop_prices$bed)
hist(prop_prices$bath)
hist(prop_prices$area_heated)
hist(prop_prices$area)
hist(prop_prices$dist_cbd)
hist(prop_prices$dist_lakes)
hist(prop_prices$pool)
hist(prop_prices$lake_front)
hist(prop_prices$downtown)
** Sale price against all quantitative variables
plot_ly(prop_prices, y = ~sale_def, x = ~area, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~area_heated, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~bed, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~dist_cbd, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~dist_lakes, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot_ly(prop_prices, y = ~sale_def, x = ~bath, type = 'scatter')
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
plot(prop_prices$area_heated, prop_prices$sale_def)
abline(lm(sale_def ~ area_heated, data=prop_prices))
plot(prop_prices$area, prop_prices$sale_def)
abline(lm(sale_def ~ area, data=prop_prices))
plot(prop_prices$bed, prop_prices$sale_def)
abline(lm(sale_def ~ bed, data=prop_prices))
plot(prop_prices$dist_cbd, prop_prices$sale_def)
abline(lm(sale_def ~ dist_cbd, data=prop_prices))
plot(prop_prices$dist_lakes, prop_prices$sale_def)
abline(lm(sale_def ~ dist_lakes, data=prop_prices))
plot(prop_prices$bath, prop_prices$sale_def)
abline(lm(sale_def ~ bath, data=prop_prices))
Provide basic summary statistics for univariate analysis. Also, provide the correlation between all the quantitative variables.
summary(prop_prices)
## sale_def bed bath area_heated
## Min. : 15296 Min. :1.000 Min. :1.000 Min. : 650
## 1st Qu.: 115565 1st Qu.:3.000 1st Qu.:2.000 1st Qu.: 1387
## Median : 151085 Median :3.000 Median :2.000 Median : 1744
## Mean : 200319 Mean :3.398 Mean :2.248 Mean : 1955
## 3rd Qu.: 213902 3rd Qu.:4.000 3rd Qu.:2.500 3rd Qu.: 2301
## Max. :7629992 Max. :7.000 Max. :8.000 Max. :10796
## area dist_cbd dist_lakes pool
## Min. : 1307 Min. : 598.2 Min. : 16.74 Min. :0.000
## 1st Qu.: 6498 1st Qu.: 9701.7 1st Qu.: 314.86 1st Qu.:0.000
## Median : 8733 Median :14552.0 Median : 638.82 Median :0.000
## Mean : 10565 Mean :14382.9 Mean :1403.54 Mean :0.229
## 3rd Qu.: 11250 3rd Qu.:19369.2 3rd Qu.:1735.35 3rd Qu.:0.000
## Max. :269496 Max. :36107.6 Max. :8206.79 Max. :1.000
## lake_front downtown luxury
## Min. :0.000 Min. :0.000 Min. :0.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000
## Median :0.000 Median :0.000 Median :0.000
## Mean :0.055 Mean :0.089 Mean :0.035
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.000
## Max. :1.000 Max. :1.000 Max. :1.000
cor(prop_prices)
## sale_def bed bath area_heated area
## sale_def 1.00000000 0.32557109 0.58731596 0.69201080 0.34354392
## bed 0.32557109 1.00000000 0.64449531 0.66911599 0.10437035
## bath 0.58731596 0.64449531 1.00000000 0.83255359 0.19298319
## area_heated 0.69201080 0.66911599 0.83255359 1.00000000 0.30184325
## area 0.34354392 0.10437035 0.19298319 0.30184325 1.00000000
## dist_cbd 0.05263700 0.23328277 0.24220637 0.25494517 0.08648327
## dist_lakes -0.08857844 0.03747888 -0.02109154 -0.04092947 -0.15432291
## pool 0.27749823 0.31299930 0.38722523 0.43899880 0.19200110
## lake_front 0.22917369 0.05615679 0.13473393 0.16065033 0.25378323
## downtown -0.01388353 -0.18866237 -0.18778515 -0.14152623 -0.04345479
## luxury 0.34714139 0.41391226 0.41047700 0.42886762 0.16275070
## dist_cbd dist_lakes pool lake_front downtown
## sale_def 0.05263700 -0.08857844 0.27749823 0.22917369 -0.01388353
## bed 0.23328277 0.03747888 0.31299930 0.05615679 -0.18866237
## bath 0.24220637 -0.02109154 0.38722523 0.13473393 -0.18778515
## area_heated 0.25494517 -0.04092947 0.43899880 0.16065033 -0.14152623
## area 0.08648327 -0.15432291 0.19200110 0.25378323 -0.04345479
## dist_cbd 1.00000000 0.26520451 0.11442899 -0.02851741 -0.55941934
## dist_lakes 0.26520451 1.00000000 -0.04295257 -0.18880720 -0.18538695
## pool 0.11442899 -0.04295257 1.00000000 0.06686163 -0.08676410
## lake_front -0.02851741 -0.18880720 0.06686163 1.00000000 -0.01378706
## downtown -0.55941934 -0.18538695 -0.08676410 -0.01378706 1.00000000
## luxury 0.11675950 -0.05119785 0.34944579 0.09725965 -0.04041648
## luxury
## sale_def 0.34714139
## bed 0.41391226
## bath 0.41047700
## area_heated 0.42886762
## area 0.16275070
## dist_cbd 0.11675950
## dist_lakes -0.05119785
## pool 0.34944579
## lake_front 0.09725965
## downtown -0.04041648
## luxury 1.00000000
Run a regression with all the variables included. Print results of the regression.
simple_reg = glm(sale_def ~ bed + bath + area + area_heated + dist_cbd + dist_lakes + pool + lake_front + downtown + luxury, data=prop_prices)
summary(simple_reg)
##
## Call:
## glm(formula = sale_def ~ bed + bath + area + area_heated + dist_cbd +
## dist_lakes + pool + lake_front + downtown + luxury, data = prop_prices)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -745210 -53773 9862 65030 4866204
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.556e+04 3.571e+04 -0.716 0.474334
## bed -9.121e+04 1.096e+04 -8.322 2.84e-16 ***
## bath 4.902e+04 1.500e+04 3.267 0.001123 **
## area 3.028e+00 6.006e-01 5.042 5.48e-07 ***
## area_heated 2.382e+02 1.425e+01 16.717 < 2e-16 ***
## dist_cbd -5.519e+00 1.260e+00 -4.381 1.31e-05 ***
## dist_lakes 3.452e+00 3.807e+00 0.907 0.364731
## pool -3.886e+04 1.651e+04 -2.354 0.018761 *
## lake_front 9.549e+04 2.816e+04 3.391 0.000723 ***
## downtown 8.418e+03 2.615e+04 0.322 0.747586
## luxury 1.532e+05 3.835e+04 3.994 6.98e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 37029720732)
##
## Null deviance: 8.3182e+13 on 999 degrees of freedom
## Residual deviance: 3.6622e+13 on 989 degrees of freedom
## AIC: 27186
##
## Number of Fisher Scoring iterations: 2
coef(simple_reg)
## (Intercept) bed bath area area_heated
## -25557.393538 -91214.859441 49022.220000 3.028256 238.178270
## dist_cbd dist_lakes pool lake_front downtown
## -5.519482 3.452146 -38857.531660 95492.372704 8418.297692
## luxury
## 153157.610459
Which of the variables tested significant at the 95% level? Looking at the results and answering outside of the chunk is sufficient.
Answer: Pool tested significant at the 95% confidence level.
As is, are any of the Gauss-Markov assumptions violated? If so, which ones? How can you fix the issues?
Using the plot of the regression model to test for Guass-Markov violations.
The Residuals vs Fitted plot shows that there is increasing variance as price increases indicating bias which violates assumption 1. This plot also show that there is no linearity which is also a violation.
The Q-Q plot does not show a diagnol line indicating that the data is not normally distributed.
plot(simple_reg)
Based off of your findings in the previous section, make changes to the variables, the functional form, etc.
# Try removing outliers
normalized <- subset(prop_prices, prop_prices$sale_def < 1000000)
# Use only the most significant variables in the formula
fixed_reg = glm(sale_def ~ bed + area_heated + lake_front + luxury, data=normalized)
summary(fixed_reg)
##
## Call:
## glm(formula = sale_def ~ bed + area_heated + lake_front + luxury,
## data = normalized)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -261811 -33253 -3773 22079 588465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8472.054 10191.716 0.831 0.4060
## bed -27164.364 3827.722 -7.097 2.44e-12 ***
## area_heated 137.089 3.938 34.816 < 2e-16 ***
## lake_front 45965.092 9714.313 4.732 2.55e-06 ***
## luxury 28550.211 13518.349 2.112 0.0349 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 4530760193)
##
## Null deviance: 1.3150e+13 on 990 degrees of freedom
## Residual deviance: 4.4673e+12 on 986 degrees of freedom
## AIC: 24853
##
## Number of Fisher Scoring iterations: 2
coef(fixed_reg)
## (Intercept) bed area_heated lake_front luxury
## 8472.0537 -27164.3638 137.0894 45965.0920 28550.2113
plot(fixed_reg)
# Looking at the plots, it seems like a better fit
# Try using log of large values to even the playing field
fixed_reg2 = glm(log(sale_def) ~ bed + log(area_heated) + lake_front + luxury, data=normalized)
summary(fixed_reg2)
##
## Call:
## glm(formula = log(sale_def) ~ bed + log(area_heated) + lake_front +
## luxury, data = normalized)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.68626 -0.16516 -0.00893 0.15101 1.12932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.00553 0.22811 13.176 < 2e-16 ***
## bed -0.08843 0.01698 -5.209 2.31e-07 ***
## log(area_heated) 1.23604 0.03515 35.166 < 2e-16 ***
## lake_front 0.16225 0.04077 3.979 7.42e-05 ***
## luxury 0.13410 0.05606 2.392 0.0169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.07978786)
##
## Null deviance: 252.224 on 990 degrees of freedom
## Residual deviance: 78.671 on 986 degrees of freedom
## AIC: 313.7
##
## Number of Fisher Scoring iterations: 2
coef(fixed_reg2)
## (Intercept) bed log(area_heated) lake_front
## 3.0055337 -0.0884263 1.2360374 0.1622502
## luxury
## 0.1340966
plot(fixed_reg2)
Based on the following inputs, predict the deflated sales price:
# Using basic model
new_house_df <- data.frame(bed=2,bath=2,area_heated=1223,area=9750,dist_cbd=19368,dist_lakes=490,pool=0,lake_front=0,luxury=0)
basic_linear_model <- lm(formula = sale_def ~ bed + bath + area + area_heated + dist_cbd +
dist_lakes + pool, data = prop_prices)
predict(basic_linear_model, newdata=new_house_df)
## 1
## 100339.2
# Now using fixed models
predict(fixed_reg, newdata=new_house_df)
## 1
## 121803.7
# Need to take e^predication since fixed_reg2 is based on logs
exp(predict(fixed_reg2, newdata=new_house_df))
## 1
## 110829.5